
** clean up demographics 
* Load the demographics extract; it is the master file onto which the
* test and inpatient person-level files are merged below.
import delimited using health/rdrp3367_demo_de_v5.csv, clear

	** test fields 
	* No assert() on this merge: rows may be demographics-only (_merge==1),
	* test-file-only (2), or matched (3).
	merge 1:1 sid using health/test_person_level, keepusing(sid nTests posTest)
	* Collapse _merge to a 0/1 flag: 1 = person appears in the test file.
	recode _merge (2 3=1) (1=0) 
	rename _merge in_test 
	* merge attaches its own value label to _merge, and recode keeps it, so a
	* 1 would still display as "master only" -- the opposite of what the flag
	* now means. Detach the label so the flag shows as plain 0/1.
	label values in_test .
	label var in_test "Appears in test person-level file"
	* Missing counts (e.g. people not matched in the test file) become 0.
	replace nTests = 0 if missing(nTests)
	replace posTest = 0 if missing(posTest)

	* assert(1 3): the inpatient file may not contain anyone who is not
	* already in memory (no using-only rows allowed).
	merge 1:1 sid using health/inp_person_level, assert(1 3) 
	recode _merge (3=1) (1=0) 
	rename _merge in_inp
	* Same stale-label issue as in_test above.
	label values in_inp .
	label var in_inp "Appears in inpatient person-level file"
	
	* Inpatient fields are missing for people without an inpatient record;
	* set them to 0 so they can be used as counts/indicators.
	foreach v in n_admit n_admit_dx no_dx_ever ever_icli ever_not_icli ever_clear {
		replace `v' = 0 if missing(`v')
	}
	* Safe as a plain comparison because n_admit_dx was zero-filled above
	* (a missing value would otherwise compare as > 0).
	gen inp_has_dx = n_admit_dx > 0
	label var inp_has_dx "In inpatient sample, has DX data"
	
	* Every person must come from at least one of the two merged files.
	assert in_inp | in_test
	
	** DOB and DOD 
	* One row per person is required from here on.
	bysort sid: assert _n==1
	* The raw dates are strings; the first 9 characters are parsed as
	* day-month-year.
	gen dob = date( substr(date_of_birth, 1,9), "DMY")
	gen dod = date( substr(date_of_death, 1,9), "DMY")

	* Age in years as of 17 Dec 2020, used to screen out implausible birth
	* dates. NOTE: aget itself is not recomputed after dob is blanked below
	* and it stays in the saved dataset.
	gen aget = (mdy(12,17,2020) - dob)/365
	count if ~inrange(aget, 0, 120)	& ~missing(aget)
	assert r(N) == 578
	replace dob = . if ~inrange(aget, 0, 120)	& ~missing(aget)

	* Keep only deaths dated between 1 Jan 2020 and 17 Dec 2020.
	* The upper bound used to be written mdy(12,17,20202); mdy() returns
	* missing for a year above 9999 and inrange() treats a missing upper
	* bound as "no upper limit", so late deaths were never blanked. The
	* bounds are now defined once so the count and the replace cannot drift.
	local dod_first = mdy(1,1,2020)
	local dod_last  = mdy(12,17,2020)
	count if ~inrange(dod, `dod_first', `dod_last') & ~missing(dod)
	* NOTE(review): 913 was recorded while the upper bound was effectively
	* open. If the extract holds any death after 17 Dec 2020 this guard will
	* now stop the script; re-verify the expected count in that case.
	assert r(N) == 913
	replace dod = . if ~inrange(dod, `dod_first', `dod_last') & ~missing(dod)
	
	gen yob = year(dob)
	format dob dod %td
	label var dob "Date of birth (stata date)"
	label var dod "Date of death (stata date, if died)"
	label var yob "Year of birth"
	* The raw string dates are no longer needed once parsed.
	drop date_of_birth date_of_death 

	** 0/1 indicators for gender, ethnicity and race; geography flags
	* gender is a one-letter code: M, F or U
	gen male           = cond(gender == "M", 1, 0)
	gen female         = cond(gender == "F", 1, 0)
	gen gender_unknown = cond(gender == "U", 1, 0)

	* anyone matching neither ethnicity string gets 0 on both indicators
	gen hispanic    = cond(ethnicity == "HISPANIC OR LATINO", 1, 0)
	gen notHispanic = cond(ethnicity == "NOT HISPANIC OR LATINO", 1, 0)

	* a missing race counts as unknown
	gen white       = cond(race == "WHITE", 1, 0)
	gen black       = cond(race == "BLACK OR AFRICAN AMERICAN", 1, 0)
	gen raceUnknown = cond(missing(race) | race == "OTHER/UNKNOWN", 1, 0)

	* whenever the zip code is missing the census block must be missing too
	assert missing(geocode_census_block_2010) if missing(zip_code)
	gen missingZip = cond(missing(zip_code), 1, 0)
	gen has_zip    = !missing(zip_code)
	gen has_cb     = !missing(geocode_census_block_2010)

	* county = the census block code with its last 10 digits dropped;
	* a result of 0 is treated as "no county"
	gen county = floor(geocode_census_block_2010/1e10)
	recode county (0 = .)
	* every non-missing county code has to fall in [18000, 19000)
	assert missing(county) | (county >= 18000 & county < 19000)
	label variable sid "Patient ID"
	
	** make age groups, comparable to census reports (age as of 2020/1/1)
	* bands: < 0 ; 0-17 ; 18-29 ; 30-49 ; 50-64 ; 65-74 ; 75+
	* each band is coded by its top age (0 for the first band, 75 for the
	* open-ended last one); people with no age stay missing
	count if missing(yob)
	local age_ref = mdy(1,1,2020)
	gen age = (`age_ref' - dob)/365

	gen age_wide = cond(age < 0,   0, ///
	               cond(age < 18, 17, ///
	               cond(age < 30, 29, ///
	               cond(age < 50, 49, ///
	               cond(age < 65, 64, ///
	               cond(age < 75, 74, 75)))))) if !missing(age)
	assert !missing(age_wide) if !missing(age)
	
	
	* same bands, except that the lowest band is "under 12" (coded 0) and the
	* next one (coded 17) therefore covers ages 12-17
	gen age_wide_12p = cond(age < 12,  0, ///
	                   cond(age < 18, 17, ///
	                   cond(age < 30, 29, ///
	                   cond(age < 50, 49, ///
	                   cond(age < 65, 64, ///
	                   cond(age < 75, 74, 75)))))) if !missing(age)
	assert !missing(age_wide_12p) if !missing(age)
	
* write the cleaned person-level file, overwriting any previous version
save health/demographics, replace 

